package com.webull.information.center.carwler.common.util.jsoup.prase_cn;

import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.regex.Pattern;

import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import com.webull.framework.util.UtilDate;
import com.webull.information.center.carwler.common.model.NewsInformation;
import com.webull.information.center.carwler.common.util.jsoup.HtmlBodyPrase;
import com.webull.information.center.carwler.common.util.jsoup.JsoupPraseUtil;
import com.webull.information.center.common.constants.Constants;

/**
 * 新浪财经频道网页解析,
 * 
 * @author shimingjun
 * @date 2016年6月28日 下午12:48:17
 * @version 1.0
 * @since JDK 1.8
 */

public class Sina_CN_tech_HtmlPrase implements HtmlBodyPrase {
	protected final Logger logger = LogManager.getLogger(getClass());

	public static void processContext1(org.jsoup.nodes.Element body, NewsInformation info) {
		Optional.ofNullable(body.select("div#artibody img").first()).ifPresent(img0 -> {
			if (img0.tagName().equalsIgnoreCase("img") && ("" + img0.attr("src")).matches("(?i).*\\.gif$")) {
				img0.remove();
			}
		});
		StringBuilder context = new StringBuilder();
		// Optional.ofNullable(body.getElementById("artibody")).map(bodey0 ->
		// bodey0.children()).ifPresent(pes -> {
		Optional.ofNullable(body.getElementById("artibody")).map(bodey0 -> bodey0.childNodes()).ifPresent(pes -> {

			p: for (int i = 0; pes != null && i < pes.size(); i++) {
				Node n = pes.get(i);
				if (!(n instanceof Element)) {
					if (n instanceof TextNode) {
						context.append(StringUtils.stripToEmpty(n.outerHtml()));
					}
					continue p;
				}

				Element p = (Element) n;

				if ("span".equals(p.tagName())) {
					context.append(StringUtils.stripToEmpty(p.text()));
					continue p;
				}

				JsoupPraseUtil.trimParagraph(p);// 段落trim

				p.select("span.img_descr").forEach(span0 -> {
					if (span0 != null && span0.parent() != null)
						span0.remove();
				});
				JsoupPraseUtil.replaceWithText(p, "span");
				JsoupPraseUtil.replaceImgAbsSrc(p);
				if (p.hasClass("article-editor") //
						|| "style".equalsIgnoreCase(p.tagName()) //
						|| p.hasClass("moduleSingleImg01") //
						|| p.hasClass("hqimg_related") //
						|| p.hasClass("bottom-tool") //
						|| p.hasClass("fin_reference") // 基金投诉台
						|| p.hasClass("xb_new_finance_app") // app 广告
						|| !p.select("a[href*=guba.sina]").isEmpty() // 股吧链接
						|| p.hasClass("finance_app_zqtg")//
						|| ("font".equals(p.tagName()) && p.hasClass("otherContent_01"))//
						|| p.select("div.ct_hqimg").size() > 0//
						|| Optional.ofNullable(p.parent()).map(p0 -> p0.hasClass("finance_app_zqtg")).orElse(false) // 广告
						|| (!p.hasText() && p.children().isEmpty())) {
					// pes.remove(i);
					continue p;
				}
				info.setMainProxy(false);
				// 如果有多张图，使用第二张图
				JsoupPraseUtil.addMainPic(p, info, 2);

				// a在前面有判断,判断完后再重置为文字
				JsoupPraseUtil.replaceWithText(p, "a");
				JsoupPraseUtil.removeBackground(p);
				context.append(StringUtils.stripToEmpty(p.outerHtml()) + "<div style='height:19px;'></div>");
			}
			info.setContent(context.toString());

		});
	}

	@Override
	public void praseNewsInfo(org.jsoup.nodes.Document doc, NewsInformation info) {
		try {

			Element body = doc.getElementById("pl_main_content");
			if (body == null)
				return;
			Optional.ofNullable(body.select("h1#main_title").first())
					.map(title0 -> StringUtils.stripToNull(title0.ownText()))
					.ifPresent(title1 -> info.setTitle(title1));
			// time
			Optional.ofNullable(doc.select("#page-tools>span.time-source>span.titer").first())
					.map(titer0 -> StringUtils.stripToNull(titer0.ownText())).ifPresent(titer1 -> {
						// 2016年08月29日 08:50
						info.setPushTime(titer1);
						Date d = UtilDate.parse(titer1, Locale.CHINESE, 8, "yyyy年MM月dd日 HH:mm");
						if (d != null) {
							info.setNewsTime(d);
						}
					});
			// getSourceName
			Optional.ofNullable(doc.select("#page-tools>span[class*=source] a").first())//
					.map(titer0 -> StringUtils.stripToNull(titer0.ownText())).ifPresent(titer1 -> {
						// 新浪科技
						info.setSourceName(titer1);
					});

			processContext1(body, info);

			if (StringUtils.isBlank(info.getLanguage())) {
				info.setLanguage(Constants.lang_zh);
			}

			if (StringUtils.isBlank(info.getSourceName())) {
				info.setSourceName("新浪科技");
			}
		} catch (Exception e) {
			logger.warn(e);
		}
	}

	public static void main(String[] args) throws Exception {
		System.out.println("sdfsdfsdpnG".matches("(?i).*\\.png$"));
		String url2 = "http://tech.sina.com.cn/i/2016-08-29/doc-ifxvixer7386338.shtml";
		url2 = "http://tech.sina.com.cn/it/2016-08-27/doc-ifxvixer7337270.shtml";
		// url2 =
		// "http://tech.sina.com.cn/2016-08-28/doc-ifxvixer7380139.shtml";
		Connection connection = Jsoup.connect(url2)
				.userAgent(
						"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
				.header("x-client-data", RandomStringUtils.randomAlphanumeric(40));

		org.jsoup.nodes.Document doc = connection.timeout(10000).get();
		NewsInformation info = new NewsInformation();
		new Sina_CN_tech_HtmlPrase().praseNewsInfo(doc, info);
		System.out.println(info);
		// System.out.println(StringUtils.deleteWhitespace("dfsd sdfs 11"));
	}
}
